package reaction.utils.string;

import java.util.*;
import java.io.*;
import java.text.*;

/**
 * The ExtractAbbrev class implements a simple algorithm for extraction of
 * abbreviations and their definitions from biomedical text. Abbreviations
 * (short forms) are extracted from the input file, and those abbreviations for
 * which a definition (long form) is found are printed out, along with that
 * definition, one per line.
 * 
 * A file consisting of short-form/long-form pairs (tab separated) can be
 * specified in tandem with the -testlist option for the purposes of evaluating
 * the algorithm.
 * 
 * @see <a href="http://biotext.berkeley.edu/papers/psb03.pdf">A Simple
 *      Algorithm for Identifying Abbreviation Definitions in Biomedical
 *      Text</a> A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing
 *      8:451-462(2003) for a detailed description of the algorithm.
 * 
 * @author Ariel Schwartz
 * @version 03/12/03
 */
public class ExtractAbbrev {

	HashMap mTestDefinitions = new HashMap();
	HashMap mStats = new HashMap();
	int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;
	char delimiter = '\t';
	boolean testMode = false;

	private boolean isValidShortForm(String str) {
		return (hasLetter(str) && (Character.isLetterOrDigit(str.charAt(0)) || (str
				.charAt(0) == '(')));
	}

	private boolean hasLetter(String str) {
		for (int i = 0; i < str.length(); i++)
			if (Character.isLetter(str.charAt(i)))
				return true;
		return false;
	}

	private boolean hasCapital(String str) {
		for (int i = 0; i < str.length(); i++)
			if (Character.isUpperCase(str.charAt(i)))
				return true;
		return false;
	}

	private void loadTrueDefinitions(String inFile) {
		String abbrString, defnString, str = "";
		Vector entry;
		HashMap definitions = mTestDefinitions;

		try {
			BufferedReader fin = new BufferedReader(new FileReader(inFile));
			while ((str = fin.readLine()) != null) {
				int j = str.indexOf(delimiter);
				abbrString = str.substring(0, j).trim();
				defnString = str.substring(j, str.length()).trim();
				entry = (Vector) definitions.get(abbrString);
				if (entry == null)
					entry = new Vector();
				entry.add(defnString);
				definitions.put(abbrString, entry);
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println(str);
		}
	}

	private boolean isTrueDefinition(String shortForm, String longForm) {
		Vector entry;
		Iterator itr;

		entry = (Vector) mTestDefinitions.get(shortForm);
		if (entry == null)
			return false;
		itr = entry.iterator();
		while (itr.hasNext()) {
			if (itr.next().toString().equalsIgnoreCase(longForm))
				return true;
		}
		return false;
	}

	public Vector<Abbreviations> extractAbbrPairs(String inFile) {

		String str, tmpStr, longForm = "", shortForm = "";
		String currSentence = "";
		int openParenIndex, closeParenIndex = -1, sentenceEnd, newCloseParenIndex, tmpIndex = -1;
		boolean newParagraph = true;
		StringTokenizer shortTokenizer;
		Vector<Abbreviations> candidates = new Vector<Abbreviations>();

		try {
			//BufferedReader fin = new BufferedReader(new FileReader(inFile));
			BufferedReader fin = new BufferedReader(new StringReader(inFile));
			
			while ((str = fin.readLine()) != null) {
				if (str.length() == 0 || newParagraph
						&& !Character.isUpperCase(str.charAt(0))) {
					currSentence = "";
					newParagraph = true;
					continue;
				}
				newParagraph = false;
				str += " ";
				currSentence += str;
				openParenIndex = currSentence.indexOf(" (");
				do {
					if (openParenIndex > -1)
						openParenIndex++;
					sentenceEnd = Math.max(currSentence.lastIndexOf(". "),
							currSentence.lastIndexOf(", "));
					if ((openParenIndex == -1) && (sentenceEnd == -1)) {
						// Do nothing
					} else if (openParenIndex == -1) {
						currSentence = currSentence.substring(sentenceEnd + 2);
					} else if ((closeParenIndex = currSentence.indexOf(')',
							openParenIndex)) > -1) {
						sentenceEnd = Math.max(
								currSentence.lastIndexOf(". ", openParenIndex),
								currSentence.lastIndexOf(", ", openParenIndex));
						if (sentenceEnd == -1)
							sentenceEnd = -2;
						longForm = currSentence.substring(sentenceEnd + 2,
								openParenIndex);
						shortForm = currSentence.substring(openParenIndex + 1,
								closeParenIndex);
					}
					if (shortForm.length() > 0 || longForm.length() > 0) {
						if (shortForm.length() > 1 && longForm.length() > 1) {
							if ((shortForm.indexOf('(') > -1)
									&& ((newCloseParenIndex = currSentence
											.indexOf(')', closeParenIndex + 1)) > -1)) {
								shortForm = currSentence.substring(
										openParenIndex + 1, newCloseParenIndex);
								closeParenIndex = newCloseParenIndex;
							}
							if ((tmpIndex = shortForm.indexOf(", ")) > -1)
								shortForm = shortForm.substring(0, tmpIndex);
							if ((tmpIndex = shortForm.indexOf("; ")) > -1)
								shortForm = shortForm.substring(0, tmpIndex);
							shortTokenizer = new StringTokenizer(shortForm);
							if (shortTokenizer.countTokens() > 2
									|| shortForm.length() > longForm.length()) {
								// Long form in ( )
								tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
								tmpStr = currSentence.substring(tmpIndex + 1, openParenIndex - 1);
								longForm = shortForm;
								shortForm = tmpStr;
								if (!hasCapital(shortForm))
									shortForm = "";
							}
							if (isValidShortForm(shortForm)) {
								extractAbbrPair(shortForm.trim(),longForm.trim(), candidates);								
							}
						}
						currSentence = currSentence.substring(closeParenIndex + 1);
					} else if (openParenIndex > -1) {
						if ((currSentence.length() - openParenIndex) > 200)
							// Matching close paren was not found
							currSentence = currSentence.substring(openParenIndex + 1);
						break; // Read next line
					}
					shortForm = "";
					longForm = "";
				} while ((openParenIndex = currSentence.indexOf(" (")) > -1);
			}
			fin.close();
			//reader.close();
		} catch (Exception ioe) {
			ioe.printStackTrace();
			System.out.println(currSentence);
			System.out.println(tmpIndex);
		}
		return candidates;
	}

	private String findBestLongForm(String shortForm, String longForm) {
		int sIndex;
		int lIndex;
		char currChar;

		sIndex = shortForm.length() - 1;
		lIndex = longForm.length() - 1;
		for (; sIndex >= 0; sIndex--) {
			currChar = Character.toLowerCase(shortForm.charAt(sIndex));
			if (!Character.isLetterOrDigit(currChar))
				continue;
			while (((lIndex >= 0) && (Character.toLowerCase(longForm
					.charAt(lIndex)) != currChar))
					|| ((sIndex == 0) && (lIndex > 0) && (Character
							.isLetterOrDigit(longForm.charAt(lIndex - 1)))))
				lIndex--;
			if (lIndex < 0)
				return null;
			lIndex--;
		}
		lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
		return longForm.substring(lIndex);
	}

	private void extractAbbrPair(String shortForm, String longForm, Vector<Abbreviations> candidates) {
		String bestLongForm;
		StringTokenizer tokenizer;
		int longFormSize, shortFormSize;

		if (shortForm.length() == 1)
			return;
		bestLongForm = findBestLongForm(shortForm, longForm);
		if (bestLongForm == null)
			return;
		tokenizer = new StringTokenizer(bestLongForm, " \t\n\r\f-");
		longFormSize = tokenizer.countTokens();
		shortFormSize = shortForm.length();
		for (int i = shortFormSize - 1; i >= 0; i--)
			if (!Character.isLetterOrDigit(shortForm.charAt(i)))
				shortFormSize--;
		if (bestLongForm.length() < shortForm.length()
				|| bestLongForm.indexOf(shortForm + " ") > -1
				|| bestLongForm.endsWith(shortForm)
				|| longFormSize > shortFormSize * 2
				|| longFormSize > shortFormSize + 5 || shortFormSize > 10)
			return;

		if (testMode) {
			if (isTrueDefinition(shortForm, bestLongForm)) {
				System.out.println(shortForm + delimiter + bestLongForm
						+ delimiter + "TP");
				truePositives++;
			} else {
				falsePositives++;
				System.out.println(shortForm + delimiter + bestLongForm
						+ delimiter + "FP");
			}
		} else {
			candidates.add(new Abbreviations(shortForm,bestLongForm));
		}
	}

	private static void usage() {
		System.err.println("Usage: ExtractAbbrev [-options] <filename>");
		System.err
				.println("       <filename> contains text from which abbreviations are extracted");
		System.err
				.println("       -testlist <file> = list of true abbreviation definition pairs");
		System.err.println("       -usage or -help = this message");
		System.exit(1);
	}

	public static void main(String[] args) {
		String shortForm, longForm, defnString, str;
		ExtractAbbrev extractAbbrev = new ExtractAbbrev();
		Vector candidates;
		String[] candidate;
		String filename = null;
		String testList = null;

		// parse arguments
		for (int i = 0; i < args.length; i++) {
			if (args[i].equals("-testlist")) {
				if (i == args.length - 1) {
					usage();
				}
				testList = args[++i];
				extractAbbrev.testMode = true;
			} else if (args[i].equals("-usage")) {
				usage();
			} else if (args[i].equals("-help")) {
				usage();
			} else {
				filename = args[i];
				// Must be last arg
				if (i != args.length - 1) {
					usage();
				}
			}
		}
		if (filename == null) {
			usage();
		}

		if (extractAbbrev.testMode)
			extractAbbrev.loadTrueDefinitions(testList);

		Vector<Abbreviations> candid = extractAbbrev.extractAbbrPairs(filename);
		
		for (Abbreviations abbreviations : candid) {
			System.out.println(abbreviations.shortForm+'\t'+abbreviations.longForm);
		}
		if (extractAbbrev.testMode)
			System.out.println("TP: " + extractAbbrev.truePositives + " FP: "
					+ extractAbbrev.falsePositives + " FN: "
					+ extractAbbrev.falseNegatives + " TN: "
					+ extractAbbrev.trueNegatives);
	}
}
